R Markdown
# Required libraries
library(dplyr)
library(Matrix)
library(Seurat)
library(cowplot)
library(ggplot2)
library(reticulate)
set.seed(666)
# Set up file names and conditions
m_files = c("data/reads.Sample1_MCSF_R1_001.fastq_bq10_star_corrected.umi.dge.txt", "data/mcsf_day6_1.txt", "data/mcsf_day6_2.txt")
m_donors = c("2", "1a", "1a")
g_files = c("data/gmcsf_day6_1.txt", "data/gmcsf_day6_2.txt", "data/reads.Sample2_gMCSF_R1_001.fastq_bq10_star_corrected.umi.dge.txt")
g_donors = c("1a", "1a", "2")
files = c(m_files, g_files)
donornames = c(m_donors, g_donors)
stimnames = c(rep("M", length(m_donors)), rep("G", length(g_donors)))
# Returns Seurat Object with donor information and stimulation condition as metadata
loadData <- function(filename, stimname, donorname) {
raw_counts <- read.table(file = filename, header = TRUE, row.names = 1, sep = "\t", stringsAsFactors = FALSE)
raw <- CreateSeuratObject(counts = raw_counts, project = "mcsf-gmcsf", min.cells = 3, min.features = 200)
raw$stim <- stimname
raw$donor <- donorname
Idents(raw) <- "stim"
return(raw)
}
performQC <- function(raw, plot = FALSE) {
# Calculate percent mitochondrial reads
mito.genes <- grep(pattern = "^MT-", x = rownames(x = GetAssayData(object = raw)), value = TRUE)
raw[["percent.mt"]] <- (colSums(GetAssayData(object = raw, slot = "counts")[mito.genes, ])/colSums(GetAssayData(object = raw,
slot = "counts")) * 100)
if (plot) {
scatter <- FeatureScatter(object = raw, feature1 = "nCount_RNA", feature2 = "nFeature_RNA", group.by = "percent.mt")
print(scatter)
# Count QC
counts = raw$nCount_RNA
cts <- qplot(counts, geom = "histogram", bins = 100)
cts_lo <- qplot(counts[counts < 5000], geom = "histogram", bins = 100)
cts_hi <- qplot(counts[counts > 10000], geom = "histogram", bins = 100)
cts_grid <- plot_grid(cts, cts_lo, cts_hi)
print(cts_grid)
# Gene QC
genes = raw$nFeature_RNA
gen <- qplot(genes, geom = "histogram", bins = 100)
gen_lo <- qplot(genes[genes < 1000], geom = "histogram", bins = 100)
gen_grid <- plot_grid(gen, gen_lo)
print(gen_grid)
vln <- VlnPlot(object = raw, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), group.by = "stim", ncol = 3)
print(vln)
}
# Filter
raw <- subset(raw, subset = nCount_RNA > 2500 & nCount_RNA < 75000 & percent.mt < 20)
raw <- NormalizeData(raw, verbose = FALSE)
raw <- FindVariableFeatures(raw, selection.method = "vst", nfeatures = 4000)
return(raw)
}
scaleData <- function(raw) {
raw <- ScaleData(raw, vars.to.regress = c("nUMI", "percent.mt"), verbose = FALSE)
raw <- RunPCA(raw, features = VariableFeatures(object = raw), npcs = 100, verbose = FALSE)
p2 <- DimPlot(raw)
# print(p2)
return(raw)
}
# Load the datasets
alldata <- lapply(1:length(files), function(i) i)
mergeruns <- lapply(1:(length(unique(g_donors)) + length(unique(m_donors))), function(i) i)
ctr = 0
for (i in seq_along(files)) {
alldata[[i]] <- loadData(files[i], stimnames[i], donornames[i])
# merge runs with the same donor and stimulation
if (i == 1 || donornames[i - 1] != donornames[i] || stimnames[i - 1] != stimnames[i]) {
if (ctr > 0) {
# cellCycle(mergeruns[[ctr]])
mergeruns[[ctr]] <- performQC(mergeruns[[ctr]], plot = TRUE)
mergeruns[[ctr]] <- scaleData(mergeruns[[ctr]])
}
ctr = ctr + 1
mergeruns[[ctr]] <- alldata[[i]]
} else {
mergeruns[[ctr]] <- merge(mergeruns[[ctr]], alldata[[i]])
}
}












mergeruns[[length(mergeruns)]] <- performQC(mergeruns[[length(mergeruns)]], plot = TRUE)




scaleData(mergeruns[[length(mergeruns)]])
## An object of class Seurat
## 16590 features across 1081 samples within 1 assay
## Active assay: RNA (16590 features)
## 1 dimensional reduction calculated: pca
# Merge all data
merged <- merge(alldata[[1]], y = alldata[2:6])
merged <- performQC(merged, plot = TRUE)




Perform integration
# returns
integrate <- function(list, dims = 50) {
k.filter = min(200, sapply(list, ncol))
list.anchors <- FindIntegrationAnchors(object.list = list, dims = 1:dims, k.filter = k.filter)
list.combined <- IntegrateData(anchorset = list.anchors, dims = 1:dims)
DefaultAssay(list.combined) <- "integrated"
return(list.combined)
}
splitdonor <- SplitObject(merged, split.by = "donor")
splitstim <- SplitObject(merged, split.by = "stim")
donor.combined <- integrate(splitdonor)
stim.combined <- integrate(splitstim)
runs.combined <- integrate(mergeruns)
cluster <- function(data, npcs = 30, resolution = 0.5) {
data <- ScaleData(data) #, vars.to.regress = c('nUMI', 'percent.mt', 'CC.Difference'), verbose = FALSE)
data <- RunPCA(data, npcs = 50, verbose = FALSE)
# t-SNE and Clustering
data <- RunUMAP(data, reduction = "pca", dims = 1:npcs)
data <- RunTSNE(data, reduction = "pca", dims = 1:npcs)
data <- FindNeighbors(data, reduction = "pca", dims = 1:npcs)
data <- FindClusters(data, resolution = resolution)
return(data)
}
splitstim <- cluster(stim.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6449
## Number of edges: 260790
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8389
## Number of communities: 8
## Elapsed time: 2 seconds
merged <- cluster(merged)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6449
## Number of edges: 256859
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8699
## Number of communities: 8
## Elapsed time: 1 seconds
splitruns <- cluster(runs.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6449
## Number of edges: 322533
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7907
## Number of communities: 9
## Elapsed time: 1 seconds
splitdonor <- cluster(donor.combined)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 6449
## Number of edges: 255621
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8290
## Number of communities: 8
## Elapsed time: 1 seconds
# Visualization
vizClust <- function(data, reduction = "umap") {
p1 <- DimPlot(data, reduction = reduction, split.by = "stim")
p2 <- DimPlot(data, reduction = reduction, split.by = "donor")
p3 <- DimPlot(data, reduction = reduction, label = TRUE)
print(plot_grid(p1, p2, p3, ncol = 1))
}
vizClust(merged)

vizClust(splitdonor)

vizClust(splitstim)

vizClust(splitruns)

markers <- function(data) {
DefaultAssay(data) <- "RNA"
Idents(data) <- "stim"
avg.exp <- log1p(AverageExpression(data, verbose = FALSE)$RNA)
avg.exp$gene <- rownames(avg.exp)
p1 <- ggplot(avg.exp, aes(M, G)) + geom_point() + ggtitle("Mph")
# print(p1)
roc.markers <- FindMarkers(data, ident.1 = "G", ident.2 = "M", test.use = "roc", verbose = FALSE)
wilcox.markers <- FindMarkers(data, ident.1 = "G", ident.2 = "M", test.use = "wilcox", verbose = FALSE)
g.response = merge(roc.markers, wilcox.markers, by = 0, all = TRUE)
print(head(g.response, n = 15))
return(g.response)
}
DefaultAssay(splitruns) <- "RNA"
splitruns.markers <- markers(splitruns)
## Row.names myAUC avg_diff power pct.1.x pct.2.x p_val
## 1 ABCG1 0.616 0.2778750 0.232 0.279 0.049 6.148143e-150
## 2 ACO1 0.631 0.2807387 0.262 0.832 0.687 5.600065e-58
## 3 ACOT7 0.629 0.3210459 0.258 0.716 0.521 5.010557e-60
## 4 ADAM10 0.378 -0.3282016 0.244 0.806 0.862 1.065538e-49
## 5 ADAM15 0.639 0.2531354 0.278 0.599 0.333 2.272680e-80
## 6 ADAMTSL4 0.672 0.2853401 0.344 0.403 0.063 6.870230e-246
## 7 ADAP2 0.387 -0.3132631 0.226 0.558 0.633 1.657069e-45
## 8 AFAP1L1 0.356 -0.4856171 0.288 0.361 0.581 4.321714e-76
## 9 AGPAT9 0.337 -0.3963841 0.326 0.760 0.849 6.457326e-87
## 10 AKR1B1 0.701 0.5226678 0.402 0.778 0.497 4.659978e-142
## 11 ALAS1 0.685 0.4707378 0.370 0.941 0.855 6.903546e-112
## 12 ALCAM 0.336 -0.3640073 0.328 0.923 0.972 4.744433e-88
## 13 ALDH1A2 0.708 0.8786520 0.416 0.621 0.258 1.215774e-192
## 14 ALDH2 0.658 0.4241406 0.316 0.826 0.662 7.648748e-84
## 15 ALOX5AP 0.808 0.8427211 0.616 0.945 0.733 1.198178e-307
## avg_logFC pct.1.y pct.2.y p_val_adj
## 1 0.2778750 0.279 0.049 1.202392e-145
## 2 0.2807387 0.832 0.687 1.095205e-53
## 3 0.3210459 0.716 0.521 9.799147e-56
## 4 -0.3282016 0.806 0.862 2.083872e-45
## 5 0.2531354 0.599 0.333 4.444680e-76
## 6 0.2853401 0.403 0.063 1.343611e-241
## 7 -0.3132631 0.558 0.633 3.240730e-41
## 8 -0.4856171 0.361 0.581 8.451975e-72
## 9 -0.3963841 0.760 0.849 1.262859e-82
## 10 0.5226678 0.778 0.497 9.113519e-138
## 11 0.4707378 0.941 0.855 1.350127e-107
## 12 -0.3640073 0.923 0.972 9.278687e-84
## 13 0.8786520 0.621 0.258 2.377690e-188
## 14 0.4241406 0.826 0.662 1.495866e-79
## 15 0.8427211 0.945 0.733 2.343277e-303
markers <- splitruns.markers[splitruns.markers$myAUC > 0.7, ]
markers <- c(markers$Row.names)
print(markers)
## [1] "AKR1B1" "ALDH1A2" "ALOX5AP" "B2M" "C19orf59" "CD44"
## [7] "CD74" "CLEC7A" "CYBB" "CYP1B1" "FAM129A" "FKBP5"
## [13] "FTH1" "GLRX" "HLA-DPA1" "HLA-DRA" "HLA-DRB1" "LYZ"
## [19] "METTL9" "MOB3B" "MRC1L1" "PRDX1" "PTAFR" "SDCBP"
## [25] "SLC7A11" "SORT1" "STAC" "TMSB10" "TPT1" "TXN"
## [31] "VDR"
plot.stim <- VlnPlot(splitruns, features = markers, group.by = "stim", pt.size = 0.01, combine = TRUE)
print(plot.stim)
